Windows
python -m pip install -U pip
Linux
pip install -U pip
In [67]:
    
!pip install wheel
    
    
In [221]:
    
%matplotlib inline
from pandas import *
from pylab import *
import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML
from __future__ import division
def side_by_side(*objs, **kwds):
    from pandas.formats.printing import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print adjoin(space, *reprs)
    
def htmldisp(x):
    try:
        return HTML(pd.DataFrame(x).to_html())
    except:
        print ("Alert!! \nInput should be a Pandas obj.")
plt.rc('figure', figsize=(10, 6))
# pandas.set_printoptions(notebook_repr_html=False)
    
In [69]:
    
plt.plot([1,2,4,8,16])
    
    Out[69]:
    
In [70]:
    
np.random.randn(5)
    
    Out[70]:
In [71]:
    
labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(np.random.randn(5), index=labels)
    
In [72]:
    
'b' in s
    
    Out[72]:
In [73]:
    
s['b']
    
    Out[73]:
In [74]:
    
s.index
    
    Out[74]:
In [75]:
    
s
    
    Out[75]:
In [76]:
    
mapping = s.to_dict()
mapping
    
    Out[76]:
In [77]:
    
s = pd.Series(mapping, index=['b', 'e', 'a', 'd', 'f'])
s
    
    Out[77]:
In [78]:
    
notnull(s)
    
    Out[78]:
In [79]:
    
s[notnull(s)]
    
    Out[79]:
In [80]:
    
s.dropna()
    
    Out[80]:
In [81]:
    
s * 2
    
    Out[81]:
In [82]:
    
s[3:]
    
    Out[82]:
In [83]:
    
s[:3]
    
    Out[83]:
In [84]:
    
s.index
    
    Out[84]:
In [85]:
    
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)})
df['d'] = range(6)
df
    
    Out[85]:
In [86]:
    
df['b']    # by column
    
    Out[86]:
In [87]:
    
df[:3]
    
    Out[87]:
In [88]:
    
df[:-2]
    
    Out[88]:
In [89]:
    
df[-2:]   # the last 2 rows
    
    Out[89]:
In [90]:
    
df[['a','b']]
    
    Out[90]:
In [91]:
    
np.tile(['foo', 'bar'], 3)   # in numpy
    
    Out[91]:
In [92]:
    
df.xs(0)   # returning the first row
    
    Out[92]:
In [93]:
    
df.ix[2]    # does equal thing as above. In this case, 3rd row.
    
    Out[93]:
In [94]:
    
df.ix[2, 'b']
    
    Out[94]:
In [95]:
    
timeit df.ix[1]
    
    
In [96]:
    
df.get_value(2, 'b')
    
    Out[96]:
In [97]:
    
timeit df.ix[2, 'b']
    
    
In [98]:
    
timeit df.get_value(2, 'b')
    
    
In [99]:
    
df.ix[2:4, 'b']
    
    Out[99]:
In [100]:
    
df.ix[2:4, ['b', 'c']]
    
    Out[100]:
In [101]:
    
df.ix[2:4, 'b':'c']   # slices by column - essentially the same as above.
    
    Out[101]:
In [102]:
    
df.ix[2:4, 0:2]
    
    Out[102]:
In [103]:
    
df.ix[[0, 2, 4], ['b', 'c', 'd']]   # pass a list of rows and columns I want to select out
    
    Out[103]:
In [104]:
    
df['c'] > 0
    
    Out[104]:
In [105]:
    
df.ix[df['c'] > 0]   # boolean arrays
    
    Out[105]:
In [106]:
    
df.index
    
    Out[106]:
In [107]:
    
df.columns
    
    Out[107]:
In [108]:
    
pandas.date_range('1/1/2000',periods=6)
    
    Out[108]:
In [109]:
    
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               index=pandas.date_range('1/1/2000', periods=6))
df
    
    Out[109]:
In [110]:
    
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               columns=['a', 'b', 'c', 'd'])
df
    
    Out[110]:
In [111]:
    
isnull(df)
    
    Out[111]:
In [112]:
    
data = {}
for col in ['foo', 'bar', 'baz']:
    for row in ['a', 'b', 'c', 'd']:
        data.setdefault(col, {})[row] = np.random.randn()   # sorted cols: bar, baz, foo
data
    
    Out[112]:
In [113]:
    
del data ['foo']['c']   # delete an entry by index key (col) and row.
    
In [114]:
    
DataFrame(data)   # pass a dict
    
    Out[114]:
In [115]:
    
close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)
    
In [172]:
    
!head -n 10 stock_data.csv
    
    
In [117]:
    
!head  stock_data.csv
    
    
In [118]:
    
close_px
    
    Out[118]:
In [119]:
    
s1 = close_px['AAPL'][-20:]      # the last 20 rows
s2 = close_px['AAPL'][-25:-10]   # the last 25 - last 10 = 15 rows.
side_by_side(s1, s2)
    
    
In [120]:
    
s1 + s2
    
    Out[120]:
In [121]:
    
s1.add(s2, fill_value=0)
    
    Out[121]:
In [122]:
    
(s1 + s2).dropna()
    
    Out[122]:
In [123]:
    
df = close_px.ix[-10:, :3]
df
    
    Out[123]:
In [124]:
    
side_by_side(s1.reindex(s2.index), s2)   # align indexes of s1 and s2. Print 2 objects alongside each other.
    
    
In [125]:
    
side_by_side(s1.ix[s2.index], s2)   # same as above, supported in newer version of 0.12 ipython.
    
    
In [126]:
    
b, c  = s1.align(s2, join='inner')
side_by_side(b, c)
    
    
In [127]:
    
b, c  = s1.align(s2, join='outer')
side_by_side(b, c)
    
    
b, c = s1.align(s2, join='right') side_by_side(b, c)
In [128]:
    
df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]
df
    
    Out[128]:
In [129]:
    
df2 = df.ix[::2, ['IBM', 'MSFT']]
side_by_side(df, df2)
    
    
In [130]:
    
df + df2
    
    Out[130]:
In [131]:
    
b, c = df.align(df2, join='inner')
side_by_side(b, c)
    
    
In [132]:
    
df[:5].T
    
    Out[132]:
In [133]:
    
n = 10
foo = DataFrame(index=range(n))
foo['floats'] = np.random.randn(n)
foo['ints'] = np.arange(n)
foo['strings'] = ['foo', 'bar'] * (n / 2)
foo['bools'] = foo['floats'] > 0
foo['objects'] = pandas.date_range('1/1/2000', periods=n)
foo
    
    Out[133]:
In [134]:
    
foo.dtypes
    
    Out[134]:
N.B. transposing is not roundtrippable in this case (column-oriented data structure)
In [135]:
    
foo.T.T
    
    Out[135]:
In [136]:
    
foo.T.T.dtypes
    
    Out[136]:
In [137]:
    
df
    
    Out[137]:
In [138]:
    
df.apply(np.mean)
    
    Out[138]:
In [139]:
    
df.mean()   # same as above.   df.mean? to look up the function usage.
    
    Out[139]:
In [140]:
    
df.mean(1)   # get the mean of the rows based on a column. Also excludes missing data (NaN: not a number).
    
    Out[140]:
In [141]:
    
df.mean(1, skipna=False)   # flag skipna: to include NaN. In this data, no NaN.
    
    Out[141]:
In [142]:
    
df.apply(np.mean, axis=1)   # same as above.
    
    Out[142]:
You can get as fancy as you want
In [143]:
    
close_px    # Output: 1000 rows.
    
    Out[143]:
In [144]:
    
close_px.AAPL   # For easier typing, after dot ., use tab to select the name of the stock.
    
    Out[144]:
In [145]:
    
close_px.AAPL.idxmax() # gives index of its maximum value.
    
    Out[145]:
In [146]:
    
close_px.AAPL.index[1]
    
    Out[146]:
In [180]:
    
close_px.AAPL.index[close_px.AAPL.max()]  # If you are numpy aficionado, does the same thing.
    
    Out[180]:
In [148]:
    
close_px.AA.idxmax()
def peak_date(series):
    return series.idxmax()
    
In [149]:
    
close_px.apply(peak_date)
    
    Out[149]:
In [150]:
    
for column in close_px:
    print close_px[column].max()
    
    
In [151]:
    
def peak_date(series):
    return series.idxmax()    # new method: series.idxmax()
close_px.apply(peak_date, axis=0)
    
    Out[151]:
In [152]:
    
df.apply(lambda x: x.max())
    
    Out[152]:
In [153]:
    
df.apply(lambda x: x.max() - x.min()) # np.ptp
    
    Out[153]:
In [154]:
    
np.log(close_px)
    
    Out[154]:
In [155]:
    
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
hdf = DataFrame(np.random.randn(10, 3), index=index,
                columns=['A', 'B', 'C'])
hdf
    
    Out[155]:
In [156]:
    
hdf.ix['foo']
    
    Out[156]:
In [157]:
    
hdf.ix['foo'] = 0
hdf
    
    Out[157]:
In [158]:
    
hdf.ix['bar','one']['A']
    
    Out[158]:
In [159]:
    
tuples = zip(*[['bar', 'bar', 'baz', 'baz',
                'foo', 'foo', 'qux', 'qux'],
               ['one', 'two', 'one', 'two',
                'one', 'two', 'one', 'two']])
index = MultiIndex.from_tuples(tuples)
columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                  ('B', 'cat'), ('A', 'dog')])
df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
df
    
    Out[159]:
In [160]:
    
df2 = df.ix[[0, 1, 2, 4, 5, 7]]
df2
    
    Out[160]:
In [161]:
    
df.unstack()['B']
    
    Out[161]:
In [162]:
    
df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                       'foo', 'bar', 'foo', 'foo'],
                'B' : ['one', 'one', 'two', 'three',
                       'two', 'two', 'one', 'three'],
                'C' : np.random.randn(8),
                'D' : np.random.randn(8)})
df
    
    Out[162]:
In [163]:
    
for key, group in df.groupby('A'):
    print key
    print group
    
    
In [164]:
    
df.groupby('A')['C'].describe().T
    
    Out[164]:
In [165]:
    
df.groupby('A').mean()
    
    Out[165]:
In [166]:
    
for key, group in df.groupby('A'):
    print key
    print group
    
    
In [167]:
    
df.groupby(['A', 'B']).mean()
    
    Out[167]:
In [168]:
    
#df.groupby(['A', 'B'], as_index=False).mean()
    
In [169]:
    
df.stack()
    
    Out[169]:
In [170]:
    
#df.stack().mean(1).unstack()
    
In [171]:
    
#df.groupby(level=1, axis=1).mean()
    
In [185]:
    
movs = pd.read_csv('movie_metadata.csv')
movs.columns
movs.columns = [c.replace("_", " ") for c in movs.columns]
movs.columns
    
    Out[185]:
In [238]:
    
_movs = movs[movs.title_year.ge(2000) & movs.country.isin(['USA', 'UK'])]
_df = pd.crosstab(_movs.title_year, _movs.country)
_df
#_df = pd.crosstab(_movs.title_year, _movs.country, values=_movs.actor_1_facebook_likes, aggfunc=[sum, mean, median])
#_df
#pd.crosstab(_movs.title_year, _movs.country).apply(lambda r: (r/r.sum())*100, axis=1)
    
    Out[238]:
In [231]:
    
movs[['imdb_score']].hist()
    
    Out[231]:
    
In [241]:
    
### IPython Gyan!
%who DataFrame
    
    
In [240]:
    
%whos DataFrame
    
    
In [239]:
    
%who_ls DataFrame
    
    Out[239]:
In [247]:
    
sal = pd.read_csv(r'Salaries.csv')
    
In [311]:
    
sal['EmpFirstLetter'] = sal.EmployeeName.str[:1]
sal.pivot_table(index=['EmpFirstLetter'], columns=['Year'], values=['TotalPay'], aggfunc=[median, mean])
    
    Out[311]:
In [314]:
    
sal['PayMark'] = 'More than 100k'
sal.loc[sal.TotalPay.le(100000), 'PayMark'] = 'Less than 100k'
sal.PayMark.unique()
    
    Out[314]:
In [316]:
    
sal.groupby('PayMark')['Id'].count()
#sal.PayMark.value_counts()
    
    Out[316]:
In [2]:
    
import sys
sys.version
    
    Out[2]:
In [3]:
    
import pandas as pd
pd.__version__
    
    Out[3]: